library(tidyverse)
library(magrittr)
library(parallel)
library(pander)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(edgeR)
library(DT)
library(ggrepel)
library(pheatmap)
library(ggdendro)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- detectCores() - 1
load(
here::here("1_Psen2S4Ter/R/output/1_1_DE.RData")
)
jellyfish v2.3.0 was used to count kmers of trimmed fastq files that had been filtered for rRNA sequences. This was performed for 5 values: \(k = 5, 6, 7, 8, 9, 10\). Lower values of \(k\) lose specificity in comparison to higher values, however as \(k\) increases, the exponential increase of possible kmers causes limitations due to computational processing time.
k5files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k5", pattern = "_dumps.txt", full.names = TRUE)
k5counts <- lapply(k5files, function(x){
read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
purrr::reduce(full_join) %>%
dplyr::select(mer, contains(c("WT", "Heter")))
k5dge <- k5counts %>%
as.data.frame() %>%
column_to_rownames("mer") %>%
DGEList() %>%
calcNormFactors()
k5dge$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname")
k5dge$samples$group <- colnames(k5dge) %>%
str_extract("(WT|Heter)") %>%
factor(levels = c("WT", "Heter"))
k5dist <- k5dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
ggplot(aes(x=count, colour = sample)) +
geom_density() +
labs(x = "intensity", title = "Distribution of 5-mers")
k5labels <- k5dge$samples %>%
mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>%
.$label
k5heat <- k5dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
t() %>%
pheatmap(silent = TRUE, cluster_cols = FALSE,
show_colnames = FALSE, fontsize = 9,
fontsize_row = 10, border_color = NA,
main = "5-mer counts heatmap", labels_row = k5labels)
k5heat$tree_row$labels <- k5labels
k5den <- ggdendrogram(k5heat$tree_row, rotate = TRUE) +
labs(title = "Hierarchical clustering of 5-mer counts") +
theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k5pca <- k5dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k5pca)$importance %>% pander(split.tables = Inf)
k5pcaPlot <- k5pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k5dge$samples) %>%
ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
geom_point(alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
y = paste0("PC2 (", percent(summary(k5pca)$importance[2, "PC2"]), ")"),
colour = "Genotype",
title = "k = 5"
) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k5pcaRrna <- k5pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k5dge$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = group), alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
y = "rRNA proportion",
colour = "Genotype",
title = "k = 5"
) +
scale_y_continuous(labels = percent) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k5design <- model.matrix(~rRNA, data = k5dge$samples)
k5voom <- voom(k5dge, design = k5design)
k5fit <- lmFit(k5voom, design = k5design)
k5eBayes <- eBayes(k5fit)
k5topTable <- k5eBayes %>%
topTable(coef = colnames(k5design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
rownames_to_column("mer") %>%
mutate(BY = p.adjust(P.Value, "BY")) %>%
mutate(DE = BY < 0.05) %>%
dplyr::select(
mer,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
BY,
t,
DE,
everything(),
-B
) %>%
as_tibble()
k6files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k6", pattern = "_dumps.txt", full.names = TRUE)
k6counts <- lapply(k6files, function(x){
read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
purrr::reduce(full_join) %>%
dplyr::select(mer, contains(c("WT", "Heter")))
k6dge <- k6counts %>%
as.data.frame() %>%
column_to_rownames("mer") %>%
DGEList() %>%
calcNormFactors()
k6dge$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname")
k6dge$samples$group <- colnames(k6dge) %>%
str_extract("(WT|Heter)") %>%
factor(levels = c("WT", "Heter"))
k6dist <- k6dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
ggplot(aes(x=count, colour = sample)) +
geom_density() +
labs(x = "intensity", title = "Distribution of 6-mers")
k6labels <- k6dge$samples %>%
mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>%
.$label
k6heat <- k6dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
t() %>%
pheatmap(silent = TRUE, cluster_cols = FALSE,
show_colnames = FALSE, fontsize = 9,
fontsize_row = 10, border_color = NA,
main = "6-mer counts heatmap", labels_row = k6labels)
k6heat$tree_row$labels <- k6labels
k6den <- ggdendrogram(k6heat$tree_row, rotate = TRUE) +
labs(title = "Hierarchical clustering of 6-mer counts") +
theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k6pca <- k6dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k6pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k6pcaPlot <- k6pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k6dge$samples) %>%
ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
geom_point(alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
y = paste0("PC2 (", percent(summary(k6pca)$importance[2, "PC2"]), ")"),
colour = "Genotype",
title = "k = 6"
) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k6pcaRrna <- k6pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k6dge$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = group), alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
y = "rRNA proportion",
colour = "Genotype",
title = "k = 6"
) +
scale_y_continuous(labels = percent) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k6design <- model.matrix(~rRNA, data = k6dge$samples)
k6voom <- voom(k6dge, design = k6design)
k6fit <- lmFit(k6voom, design = k6design)
k6eBayes <- eBayes(k6fit)
k6topTable <- k6eBayes %>%
topTable(coef = colnames(k6design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
rownames_to_column("mer") %>%
mutate(BY = p.adjust(P.Value, "BY")) %>%
mutate(DE = BY < 0.05) %>%
dplyr::select(
mer,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
BY,
t,
DE,
everything(),
-B
) %>%
as_tibble()
k7files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k7", pattern = "_dumps.txt", full.names = TRUE)
k7counts <- lapply(k7files, function(x){
read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
purrr::reduce(full_join) %>%
dplyr::select(mer, contains(c("WT", "Heter")))
k7dge <- k7counts %>%
as.data.frame() %>%
column_to_rownames("mer") %>%
DGEList() %>%
calcNormFactors()
k7dge$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname")
k7dge$samples$group <- colnames(k7dge) %>%
str_extract("(WT|Heter)") %>%
factor(levels = c("WT", "Heter"))
k7dist <- k7dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
ggplot(aes(x=count, colour = sample)) +
geom_density() +
labs(x = "intensity", title = "Distribution of 7-mers")
k7labels <- k7dge$samples %>%
mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>%
.$label
k7heat <- k7dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
t() %>%
pheatmap(silent = TRUE, cluster_cols = FALSE,
show_colnames = FALSE, fontsize = 9,
fontsize_row = 10, border_color = NA,
main = "7-mer counts heatmap", labels_row = k7labels)
k7heat$tree_row$labels <- k7labels
k7den <- ggdendrogram(k7heat$tree_row, rotate = TRUE) +
labs(title = "Hierarchical clustering of 7-mer counts") +
theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k7pca <- k7dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k7pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k7pcaPlot <- k7pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k7dge$samples) %>%
ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
geom_point(alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
y = paste0("PC2 (", percent(summary(k7pca)$importance[2, "PC2"]), ")"),
colour = "Genotype",
title = "k = 7"
) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k7pcaRrna <- k7pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k7dge$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = group), alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
y = "rRNA proportion",
colour = "Genotype",
title = "k = 7"
) +
scale_y_continuous(labels = percent) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k7design <- model.matrix(~rRNA, data = k7dge$samples)
k7voom <- voom(k7dge, design = k7design)
k7fit <- lmFit(k7voom, design = k7design)
k7eBayes <- eBayes(k7fit)
k7topTable <- k7eBayes %>%
topTable(coef = colnames(k7design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
rownames_to_column("mer") %>%
mutate(BY = p.adjust(P.Value, "BY")) %>%
mutate(DE = BY < 0.05) %>%
dplyr::select(
mer,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
BY,
t,
DE,
everything(),
-B
) %>%
as_tibble()
k8files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k8", pattern = "_dumps.txt", full.names = TRUE)
k8counts <- lapply(k8files, function(x){
read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
purrr::reduce(full_join) %>%
dplyr::select(mer, contains(c("WT", "Heter")))
k8dge <- k8counts %>%
as.data.frame() %>%
column_to_rownames("mer") %>%
DGEList() %>%
calcNormFactors()
k8dge$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname")
k8dge$samples$group <- colnames(k8dge) %>%
str_extract("(WT|Heter)") %>%
factor(levels = c("WT", "Heter"))
k8dist <- k8dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
ggplot(aes(x=count, colour = sample)) +
geom_density() +
labs(x = "intensity", title = "Distribution of 8-mers")
k8labels <- k8dge$samples %>%
mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>%
.$label
k8heat <- k8dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
t() %>%
pheatmap(silent = TRUE, cluster_cols = FALSE,
show_colnames = FALSE, fontsize = 9,
fontsize_row = 10, border_color = NA,
main = "8-mer counts heatmap", labels_row = k8labels)
k8heat$tree_row$labels <- k8labels
k8den <- ggdendrogram(k8heat$tree_row, rotate = TRUE) +
labs(title = "Hierarchical clustering of 8-mer counts") +
theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k8pca <- k8dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k8pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k8pcaPlot <- k8pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k8dge$samples) %>%
ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
geom_point(alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
y = paste0("PC2 (", percent(summary(k8pca)$importance[2, "PC2"]), ")"),
colour = "Genotype",
title = "k = 8"
) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k8pcaRrna <- k8pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k8dge$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = group), alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
y = "rRNA proportion",
colour = "Genotype",
title = "k = 8"
) +
scale_y_continuous(labels = percent) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k8design <- model.matrix(~rRNA, data = k8dge$samples)
k8voom <- voom(k8dge, design = k8design)
k8fit <- lmFit(k8voom, design = k8design)
k8eBayes <- eBayes(k8fit)
k8topTable <- k8eBayes %>%
topTable(coef = colnames(k8design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
rownames_to_column("mer") %>%
mutate(BY = p.adjust(P.Value, "BY")) %>%
mutate(DE = BY < 0.05) %>%
dplyr::select(
mer,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
BY,
t,
DE,
everything(),
-B
) %>%
as_tibble()
k9files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k9", pattern = "_dumps.txt", full.names = TRUE)
k9counts <- lapply(k9files, function(x){
read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
purrr::reduce(full_join) %>%
dplyr::select(mer, contains(c("WT", "Heter")))
k9dge <- k9counts %>%
as.data.frame() %>%
column_to_rownames("mer") %>%
DGEList() %>%
calcNormFactors()
k9dge$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname")
k9dge$samples$group <- colnames(k9dge) %>%
str_extract("(WT|Heter)") %>%
factor(levels = c("WT", "Heter"))
k9dist <- k9dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
ggplot(aes(x=count, colour = sample)) +
geom_density() +
labs(x = "intensity", title = "Distribution of 9-mers")
k9labels <- k9dge$samples %>%
mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>%
.$label
k9heat <- k9dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
t() %>%
pheatmap(silent = TRUE, cluster_cols = FALSE,
show_colnames = FALSE, fontsize = 9,
fontsize_row = 10, border_color = NA,
main = "9-mer counts heatmap", labels_row = k9labels)
k9heat$tree_row$labels <- k9labels
k9den <- ggdendrogram(k9heat$tree_row, rotate = TRUE) +
labs(title = "Hierarchical clustering of 9-mer counts") +
theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k9pca <- k9dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k9pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k9pcaPlot <- k9pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k9dge$samples) %>%
ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
geom_point(alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
y = paste0("PC2 (", percent(summary(k9pca)$importance[2, "PC2"]), ")"),
colour = "Genotype",
title = "k = 9"
) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k9pcaRrna <- k9pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k9dge$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = group), alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
y = "rRNA proportion",
colour = "Genotype",
title = "k = 9"
) +
scale_y_continuous(labels = percent) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k9design <- model.matrix(~rRNA, data = k9dge$samples)
k9voom <- voom(k9dge, design = k9design)
k9fit <- lmFit(k9voom, design = k9design)
k9eBayes <- eBayes(k9fit)
k9topTable <- k9eBayes %>%
topTable(coef = colnames(k9design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
rownames_to_column("mer") %>%
mutate(BY = p.adjust(P.Value, "BY")) %>%
mutate(DE = BY < 0.05) %>%
dplyr::select(
mer,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
BY,
t,
DE,
everything(),
-B
) %>%
as_tibble()
k10files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k10", pattern = "_dumps.txt", full.names = TRUE)
k10counts <- lapply(k10files, function(x){
read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
purrr::reduce(full_join) %>%
dplyr::select(mer, contains(c("WT", "Heter")))
k10dge <- k10counts %>%
as.data.frame() %>%
column_to_rownames("mer") %>%
DGEList() %>%
calcNormFactors()
k10dge$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname")
k10dge$samples$group <- colnames(k10dge) %>%
str_extract("(WT|Heter)") %>%
factor(levels = c("WT", "Heter"))
k10dist <- k10dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
ggplot(aes(x=count, colour = sample)) +
geom_density() +
labs(x = "intensity", title = "Distribution of 10-mers")
k10labels <- k10dge$samples %>%
mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>%
.$label
k10heat <- k10dge %>%
cpm(log = TRUE) %>%
as.data.frame() %>%
t() %>%
pheatmap(silent = TRUE, cluster_cols = FALSE,
show_colnames = FALSE, fontsize = 9,
fontsize_row = 10, border_color = NA,
main = "10-mer counts heatmap", labels_row = k10labels)
k10heat$tree_row$labels <- k10labels
k10den <- ggdendrogram(k10heat$tree_row, rotate = TRUE) +
labs(title = "Hierarchical clustering of 10-mer counts") +
theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k10pca <- k10dge %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k10pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k10pcaPlot <- k10pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k10dge$samples) %>%
ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
geom_point(alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
y = paste0("PC2 (", percent(summary(k10pca)$importance[2, "PC2"]), ")"),
colour = "Genotype",
title = "k = 10"
) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k10pcaRrna <- k10pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(k10dge$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = group), alpha = 0.8, size = 3) +
geom_text_repel(show.legend = FALSE) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
y = "rRNA proportion",
colour = "Genotype",
title = "k = 10"
) +
scale_y_continuous(labels = percent) +
scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k10design <- model.matrix(~rRNA, data = k10dge$samples)
k10voom <- voom(k10dge, design = k10design)
k10fit <- lmFit(k10voom, design = k10design)
k10eBayes <- eBayes(k10fit)
k10topTable <- k10eBayes %>%
topTable(coef = colnames(k10design)[2], sort.by = "p", n = Inf) %>%
set_colnames(str_remove(colnames(.), "ID\\.")) %>%
rownames_to_column("mer") %>%
mutate(BY = p.adjust(P.Value, "BY")) %>%
mutate(DE = BY < 0.05) %>%
dplyr::select(
mer,
AveExpr,
logFC,
P.Value,
FDR = adj.P.Val,
BY,
t,
DE,
everything(),
-B
) %>%
as_tibble()
ggarrange(
k5dist, k6dist, k7dist, k8dist, k9dist, k10dist,
ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
Distributions of kmer counts for each value of \(k\)
ggarrange(
k5pcaPlot, k6pcaPlot, k7pcaPlot, k8pcaPlot, k9pcaPlot, k10pcaPlot,
ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
Principal component analysis for all values of \(k\). As \(k\) increases, WT and mutant groups start to separate.
ggarrange(
k5pcaRrna, k6pcaRrna, k7pcaRrna, k8pcaRrna, k9pcaRrna, k10pcaRrna,
ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
Principal component analysis for all values of \(k\). As \(k\) increases, WT and mutant groups start to separate.
ggarrange(
k5den, k6den, k7den, k8den, k9den, k10den,
ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
Hierarchical clustering of samples based on kmer counts. As \(k\) increases, WT and mutant groups start to cluster.
topRes <- function(x, cap){
x %>%
dplyr::select(mer, AveExpr, logFC, P.Value, FDR, BY, DE) %>%
mutate(
AveExpr = format(round(AveExpr, 2), nsmall = 2),
logFC = format(round(logFC, 2), nsmall = 2),
P.Value = sprintf("%.2e", P.Value),
FDR = sprintf("%.2e", FDR),
BY = sprintf("%.2e", BY)
) %>%
dplyr::slice(1:100) %>%
datatable(
options = list(pageLength = 10),
class = "striped hover condensed responsive",
filter = "top",
caption = cap
)
}
topRes(k5topTable,
cap = paste(
"The top 100 differentially expressed 5-mers.",
nrow(dplyr::filter(k5topTable, DE)),
"of",
nrow(k5topTable),
"detected sequences were classified as DE with BY p-value < 0.05."
)
)
k5topTable %>%
ggplot(aes(P.Value)) +
geom_histogram(
binwidth = 0.02,
colour = "black", fill = "grey90"
) +
labs(title = "k = 5")
Histogram of p-values. Values follow the expected distribution when there are many differences.
k5topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
scale_colour_manual(values = c("black", "grey50", "red")) +
geom_text_repel(
data = . %>%
# dplyr::filter(-log10(P.Value) > 4 | logFC > 4 | logFC < -2),
dplyr::filter(-log10(P.Value) > 4 | logFC > 3 | logFC < -2.5),
aes(label = mer, color = "black")
) +
labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 5") +
theme_bw() +
theme(legend.position = "none")
Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.
topRes(k6topTable,
cap = paste(
"The top 100 differentially expressed 6-mers.",
nrow(dplyr::filter(k6topTable, DE)),
"of",
nrow(k6topTable),
"detected sequences were classified as DE with a BY p-value < 0.05."
)
)
k6topTable %>%
ggplot(aes(P.Value)) +
geom_histogram(
binwidth = 0.02,
colour = "black", fill = "grey90"
) +
labs(title = "k = 6")
Histogram of p-values. Values follow the expected distribution when there are some differences.
k6topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
scale_colour_manual(values = c("black", "grey50", "red")) +
geom_text_repel(
data = . %>%
dplyr::filter(-log10(P.Value) > 6 | logFC > 6 | logFC < -2.3),
aes(label = mer, color = "black")
) +
labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 6") +
theme_bw() +
theme(legend.position = "none")
Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.
topRes(k7topTable,
cap = paste(
"The top 100 differentially expressed 7-mers.",
nrow(dplyr::filter(k7topTable, DE)),
"of",
nrow(k7topTable),
"detected sequences were classified as DE with a BY p-value < 0.05."
)
)
k7topTable %>%
ggplot(aes(P.Value)) +
geom_histogram(
binwidth = 0.02,
colour = "black", fill = "grey90"
) +
labs(title = "k = 7")
Histogram of p-values. Values follow the expected distribution when there are many differences.
k7topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
scale_colour_manual(values = c("black", "grey50", "red")) +
geom_text_repel(
data = . %>%
dplyr::filter(-log10(P.Value) > 6.4 | logFC > 10 | logFC < -5),
aes(label = mer, color = "black")
) +
labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 7") +
theme_bw() +
theme(legend.position = "none")
Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.
topRes(k8topTable,
cap = paste(
"The top 100 differentially expressed 8-mers.",
nrow(dplyr::filter(k8topTable, DE)),
"of",
nrow(k8topTable),
"detected sequences were classified as DE with a BY p-value < 0.05."
)
)
k8topTable %>%
ggplot(aes(P.Value)) +
geom_histogram(
binwidth = 0.02,
colour = "black", fill = "grey90"
) +
labs(title = "k = 8")
Histogram of p-values. Values follow the expected distribution when there are many differences.
k8topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
scale_colour_manual(values = c("black", "grey50", "red")) +
geom_text_repel(
data = . %>%
dplyr::filter(-log10(P.Value) > 7.2 | logFC > 12.5 | logFC < -8),
aes(label = mer, color = "black")
) +
labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 8") +
theme_bw() +
theme(legend.position = "none")
Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.
topRes(k9topTable,
cap = paste(
"The top 100 differentially expressed 9-mers.",
nrow(dplyr::filter(k9topTable, DE)),
"of",
nrow(k9topTable),
"detected sequences were classified as DE with a BY p-value < 0.05."
)
)
k9topTable %>%
ggplot(aes(P.Value)) +
geom_histogram(
binwidth = 0.02,
colour = "black", fill = "grey90"
) +
labs(title = "k = 9")
Histogram of p-values. Values follow the expected distribution when there are many differences.
k9topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
scale_colour_manual(values = c("black", "grey50", "red")) +
geom_text_repel(
data = . %>%
dplyr::filter(DE & -log10(P.Value) > 7.5 | logFC < -10 | logFC > 15),
aes(label = mer, color = "black")
) +
labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 9") +
theme_bw() +
theme(legend.position = "none")
Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.
topRes(k10topTable,
cap = paste(
"The top 100 differentially expressed 10-mers.",
nrow(dplyr::filter(k10topTable, DE)),
"of",
nrow(k10topTable),
"detected sequences were classified as DE with a BY p-value < 0.05."
)
)
k10topTable %>%
ggplot(aes(P.Value)) +
geom_histogram(
binwidth = 0.02,
colour = "black", fill = "grey90"
) +
labs(title = "k = 10")
Histogram of p-values. Values follow the expected distribution when there are many differences.
k10topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
scale_colour_manual(values = c("black", "grey50", "red")) +
geom_text_repel(
data = . %>%
dplyr::filter(-log10(P.Value) > 8.5 | logFC > 20 | logFC < -13.5),
aes(label = mer, color = "black")
) +
labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 10") +
theme_bw() +
theme(legend.position = "none")
Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.
save(
k5topTable,
k6topTable,
k7topTable,
k8topTable,
k9topTable,
k10topTable,
file = here::here(
"1_Psen2S4Ter/R/output/1_2_kmer.RData"
)
)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggdendro_0.1.22 pheatmap_1.0.12 ggrepel_0.8.2 DT_0.14
## [5] edgeR_3.30.3 limma_3.44.3 kableExtra_1.1.0 ggpubr_0.4.0
## [9] scales_1.1.1 here_0.1 pander_0.6.3 magrittr_1.5
## [13] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4
## [17] readr_1.3.1 tidyr_1.1.0 tibble_3.0.3 ggplot2_3.3.2
## [21] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-149 fs_1.4.2 lubridate_1.7.9 webshot_0.5.2
## [5] RColorBrewer_1.1-2 httr_1.4.1 rprojroot_1.3-2 tools_4.0.3
## [9] backports_1.1.8 R6_2.4.1 mgcv_1.8-33 DBI_1.1.0
## [13] colorspace_1.4-1 withr_2.2.0 tidyselect_1.1.0 gridExtra_2.3
## [17] curl_4.3 compiler_4.0.3 cli_2.0.2 rvest_0.3.5
## [21] xml2_1.3.2 labeling_0.3 digest_0.6.25 foreign_0.8-80
## [25] rmarkdown_2.3 rio_0.5.16 pkgconfig_2.0.3 htmltools_0.5.0
## [29] dbplyr_1.4.4 highr_0.8 htmlwidgets_1.5.1 rlang_0.4.7
## [33] readxl_1.3.1 rstudioapi_0.11 generics_0.0.2 farver_2.0.3
## [37] jsonlite_1.7.0 crosstalk_1.1.0.1 zip_2.0.4 car_3.0-8
## [41] Matrix_1.2-18 Rcpp_1.0.5 munsell_0.5.0 fansi_0.4.1
## [45] abind_1.4-5 lifecycle_0.2.0 stringi_1.4.6 yaml_2.2.1
## [49] carData_3.0-4 MASS_7.3-53 grid_4.0.3 blob_1.2.1
## [53] crayon_1.3.4 lattice_0.20-41 splines_4.0.3 haven_2.3.1
## [57] cowplot_1.0.0 hms_0.5.3 locfit_1.5-9.4 knitr_1.29
## [61] pillar_1.4.6 ggsignif_0.6.0 reprex_0.3.0 glue_1.4.1
## [65] evaluate_0.14 data.table_1.12.8 modelr_0.1.8 vctrs_0.3.2
## [69] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1 xfun_0.15
## [73] openxlsx_4.1.5 broom_0.7.0 rstatix_0.6.0 viridisLite_0.3.0
## [77] ellipsis_0.3.1